import os
import numpy as np
import pandas as pd
from pplots_new import read_embeddings, plot_embedding, plot_embedding_interactive, rotate, get_colors
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
The user must provide a path to the input file in .mfasta format and path to the output directory for intermediate file storage:
path_to_PM="/home/lavande/galochkina/SCIENCE/POINCARE/PoincareMSA"
mfasta = path_to_PM+"/examples/thioredoxins/thioredoxins.mfasta" # full path to the input MSA in mfasta format
path_out = path_to_PM+"/examples/thioredoxins_test" # a directory to write resutling files
out_name = "thioredoxins" # name given to the output files
path_to_figures = "/home/lavande/galochkina/SCIENCE/POINCARE/PoincareMSA/figures"
All scripts necessary for data preparation are located in scirpts/data_preparation:
path_prep_scripts = path_to_PM+"/scripts/prepare_data"
Data preparation consists in .mfasta cleaning according to a gap threshold and translation of each sequence to the PSSM profile:
gapth = "0.9" # threshold for filtering gapped positions
prep_parameters = path_prep_scripts+" "+mfasta+" "+path_out+" "+out_name+" "+gapth # parameters for data preparation
# print(prep_parameters)
print(path_prep_scripts+"/create_projection.sh " + prep_parameters)
/home/lavande/galochkina/SCIENCE/POINCARE/PoincareMSA/scripts/prepare_data/create_projection.sh /home/lavande/galochkina/SCIENCE/POINCARE/PoincareMSA/scripts/prepare_data /home/lavande/galochkina/SCIENCE/POINCARE/PoincareMSA/examples/thioredoxins/thioredoxins.mfasta /home/lavande/galochkina/SCIENCE/POINCARE/PoincareMSA/examples/thioredoxins_test thioredoxins 0.9
os.system(path_prep_scripts+"/create_projection.sh " + prep_parameters)
print("Output files ready for projection are written to: "+path_out+"/fasta"+gapth)
Output files ready for projection are written to: /home/lavande/galochkina/SCIENCE/POINCARE/PoincareMSA/examples/thioredoxins_test/fasta0.9
You can change the parameters of the projection here:
knn = "5"
gamma = "2.00"
sigma = "1.00"
batchs = "4"
cospca = "0"
epochs = "1000"
seed = "1"
Then, the following command creates a projection of encoded sequences to a Poincaré disk:
path_to_build_PM = path_to_PM+"/scripts/build_poincare_map"
pm_command = "python "+path_to_build_PM+"/main.py --input_path "+path_out+"/fasta"+gapth+" --output_path "+path_out+"/projections/ --gamma "+gamma+" --pca "+ cospca+" --epochs "+epochs+" --seed "+seed
print(pm_command)
os.system(pm_command)
python /home/lavande/galochkina/SCIENCE/POINCARE/PoincareMSA/scripts/build_poincare_map/main.py --input_path /home/lavande/galochkina/SCIENCE/POINCARE/PoincareMSA/examples/thioredoxins_test/fasta0.9 --output_path /home/lavande/galochkina/SCIENCE/POINCARE/PoincareMSA/examples/thioredoxins_test/projections/ --gamma 2.00 --pca 0 --epochs 1000 --seed 1
0
Parameters by default are provided in comments. The output files are then written to the following file:
path_embedding = path_out+"/projections/PM"+knn+"sigma="+sigma+"gamma="+gamma+"cosinepca="+cospca+"_seed"+seed+".csv"
print(path_embedding)
/home/lavande/galochkina/SCIENCE/POINCARE/PoincareMSA/examples/thioredoxins_test/projections/PM5sigma=1.00gamma=2.00cosinepca=0_seed1.csv
One can visualieze the resulting projection using any convenient coloring. To do so, the user shoud provide a .csv file with each line corresponding to a protein:
path_annotation = path_to_PM+"/visualization/data/thioredoxin_annotation.csv" # path to annotation file
thx_df = pd.read_csv(path_annotation, index_col = 0)
thx_df
| Species | Kingdom | Family | Color_eukaryota | |
|---|---|---|---|---|
| 0 | LBCA | LBCA | LBCA | OTHERS |
| 1 | LACA | LACA | LACA | OTHERS |
| 2 | AECA | AECA | AECA | OTHERS |
| 3 | LGPCA | LGPCA | LGPCA | OTHERS |
| 4 | LECA | LECA | LECA | OTHERS |
| ... | ... | ... | ... | ... |
| 205 | Escherichia | Bacteria | Proteobacteria | OTHERS |
| 206 | Shigella | Bacteria | Proteobacteria | OTHERS |
| 207 | Salmonella | Bacteria | Proteobacteria | OTHERS |
| 208 | Yersinia | Bacteria | Proteobacteria | OTHERS |
| 209 | Vibrio | Bacteria | Proteobacteria | OTHERS |
210 rows × 4 columns
A user can also create a custom color palette:
# construction of palette
# class species from thioredoxin annotation file into different kingdom
bacteria_specie = ["Bacteria", "LBCA", "LGPCA", "LPBCA", "Salmonella", "Shigella", "Escherichia", "Acidobacteria", "Deinococcus", "Thermus",
"Solibacter", "Helicobacter", "Geobacter", "Wolinella", "Neisseria", "Bordetella", "Brucella", "Rickettsia", "Campylobacter",
"Thiobacillus", "Burkholderia", "Vibrio", "Desulfovibrio", "Bdellovibrio", "Yersinia", "Agrobacterium", "Actinobacteria", "Thermobifida",
"Flavobacterium","Geobacterium", "Chloroflexus", "Aquifex", "Enterococcus","Chlamydia", "Chlamydophila", "Listeria", "Lactobacillus",
"Geobacillus", "Chlorobium", "Rhodopirrelula", "Clostridium", "Hyperthermus", "Porphyromonas", "Streptomyces",
"Syneschocystis", "Nostoc", "Thermosynechococus", "Prochlorococcus", "Nostoc1", "Sinorhizobium", "Pseudomonas", "Bacteroides", "Staphylococcus",
"Thermoanaerobacter","Bacillus", "Mycobacterium", "Streptococcus", "Dehalococcoides", "Synechocystis", "Corynebacterium", "Thermosynechococcus",
"Rhodopirellula"]
archae_specie = ["Archaea", "LACA", "Aeropyrum","Thermofilum","Caldivirga","Sulfolobus","Haloquadratum","Haloarcula", "Thermoplasma", "Hyperthermus",
"Natronomonas","Methanocorpusculum", "Methanococcus", "Halobacterium", "Picrophilus","Methanospirillum","Staphylothermus",
"Methanosaeta", "Metallosphaera","Methanococcoides", "Candidatus","Archaeoglobus"]
eukaryota_animal= ["Eukaryota", "LECA", "LAFCA", "Ovis","Bos", "Mus","Rattus","Rabit", "Human", "Ponab","Macmu","Ornithorhynchus","Gallus","Equus",
"Theileria","Danio","Tetraodon","Xenopus","Ictalurus","Ophiophagus","Callithrix","Monodelphis","Geocy","Sus","Melopsittacus", # animal
"Monosiga", # choanoflagellata
"Graphocephala", "Tribolium", "Apis", "Bombyx","Litopenaeus","Drosophila", # insecta
"Entamoeba", "Plasmodium","Cryptosporidium", "Dictyostelium" # unicellular parasite
]
eukaryota_viridiplantae = ["Helicosporidium", "Ostreococcus", "Fagopyrum"]
eukaryota_mitochondrion = ["Bovin Mitochondrio", "Homo Mitochondrion", "Rattus Mitochondrio", "Mus Mitochondrion"]
eukaryota_chloroplast = ["Brana Chloroplast", "Pisum Chloroplast", "Wheat Chloroplast", "Pea Chloroplast", "Spiol Chloroplast", "Arabidopsis"]
eukaryota_fungi = ["Pichia", "Candida", "Aspergillus", "Kluyveromyces", "Saccharomyces","Schizosaccharomyces","Neosartorya"]
# construction of palette
trx_palette = {"OTHERS" : "#c7c7c7", "EUKARYOTA" : "#31955d", "CHLOROPLAST" : "#06f9d4", "MITOCHONDRION" : "#3ff100" , "root": "#000000", "AECA" : "#e7e53c",
"Zea": "#c7c7c7", "Vitis":"#c7c7c7", "Limonium": "#c7c7c7"}
trx_palette.update(dict.fromkeys(eukaryota_animal,"#21e548"))
trx_palette.update(dict.fromkeys(eukaryota_viridiplantae,"#b5e521"))
trx_palette.update(dict.fromkeys(eukaryota_mitochondrion, "#61ff06"))
trx_palette.update(dict.fromkeys(eukaryota_chloroplast,"#06f9d4"))
trx_palette.update(dict.fromkeys(eukaryota_fungi,"#1fb995"))
trx_palette.update(dict.fromkeys(bacteria_specie,"#34b8e7"))
trx_palette.update(dict.fromkeys(archae_specie, "#ff0000"))
#path_embedding = os.getcwd() + "/data/kinases_out0.9/batchsize4_epochs1000/PM5sigma=1.00gamma=3.00cosinepca=0_seed0.csv"
#path_annotation = os.getcwd() + "/data/kinase_group_new.csv"
df = read_embeddings(path_embedding, path_annotation, withroot=False)
result: pm1 pm2 Unnamed: 0 Species Kingdom \
proteins_id
1 0.183893 0.108609 0 LBCA LBCA
2 0.111038 -0.141715 1 LACA LACA
3 0.050593 -0.098363 2 AECA AECA
4 0.313914 0.437660 3 LGPCA LGPCA
5 -0.633037 -0.001717 4 LECA LECA
... ... ... ... ... ...
206 0.868353 -0.141709 205 Escherichia Bacteria
207 0.860335 -0.148330 206 Shigella Bacteria
208 0.847628 -0.142849 207 Salmonella Bacteria
209 0.865296 -0.160147 208 Yersinia Bacteria
210 0.859967 -0.116851 209 Vibrio Bacteria
Family Color_eukaryota
proteins_id
1 LBCA OTHERS
2 LACA OTHERS
3 AECA OTHERS
4 LGPCA OTHERS
5 LECA OTHERS
... ... ...
206 Proteobacteria OTHERS
207 Proteobacteria OTHERS
208 Proteobacteria OTHERS
209 Proteobacteria OTHERS
210 Proteobacteria OTHERS
[210 rows x 7 columns]
Here follow several examples of kinase family visualization.
trace1 = plot_embedding_interactive(df,
labels_name = 'Kingdom',#'1_Group',#'2_Gene',
show_text=True,
color_palette = trx_palette,
title = "Poincaré Map projection of thioredoxins colored by kingdom of life",
fontsize = 10,
labels_text= ['LBCA', 'LACA', 'AECA', 'LGPCA', 'LECA', 'LPBCA', 'LAFCA'],
#second_labels_name = "2_Gene",
#labels_text = ["RPS6KA1_1", "RPS6KA2_1", "RPS6KA5_1", "RPS6KB2", "RPS6KA1_2", "RPS6KA2_2", "RPS6KA3_2", "RPS6KA5_2"] # some sequences of AGC first domain and CAMK second domain (mentionned in the article) to label
#labels_text = ["CLK3_HUMAN", "SRPK3_HUMAN", "HIPK1_HUMAN","CSK22_HUMAN"] # some CMGC kinase
)
#trace1.write_image(path_to_figures+"/Thioredoxins_by_kingdom.pdf")
trace1.show()
Index(['Unnamed: 0', 'Species', 'Kingdom', 'Family', 'Color_eukaryota'], dtype='object')
trace2 = plot_embedding_interactive(df,
labels_name = 'Species',#'1_Group',#'2_Gene',
show_text=True,
color_palette = trx_palette,
title = "Poincaré Map projection of thioredoxins colored by species",
fontsize = 10,
labels_text= ['LBCA', 'LACA', 'AECA', 'LGPCA', 'LECA', 'LPBCA', 'LAFCA'],
#second_labels_name = "2_Gene",
#labels_text = ["RPS6KA1_1", "RPS6KA2_1", "RPS6KA5_1", "RPS6KB2", "RPS6KA1_2", "RPS6KA2_2", "RPS6KA3_2", "RPS6KA5_2"] # some sequences of AGC first domain and CAMK second domain (mentionned in the article) to label
#labels_text = ["CLK3_HUMAN", "SRPK3_HUMAN", "HIPK1_HUMAN","CSK22_HUMAN"] # some CMGC kinase
)
#trace2.write_image(path_to_figures+"/Thioredoxins_by_species.pdf")
trace2.show()
Index(['Unnamed: 0', 'Species', 'Kingdom', 'Family', 'Color_eukaryota'], dtype='object')